# Title: 8-create-grid-cells.R
# Description: Create a panel dataset by grid cell. 
# Authors: Peter T. Leeson and Jacob W. Russ 

# library(readr)
# library(raster)
# library(dplyr)
# library(magrittr)
# library(tidyr)
p_load(maptools) # 
p_load(sp) # 
# library(rgdal)
# library(ggplot2)

# Import datasets -------------------------------------------------------------

euro_spdf   <- readOGR(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019","data","raw","Eurostat","resolution_60_mil","NUTS_RG_60M_2010.shp"),
                       p4s = "+init=epsg:4326",
                       # file says p4s is: +proj=longlat +ellps=GRS80 +no_defs;
                       # file CRS will be overridden, but that's fine;
                       # file is prob. ETRS89 (EU standard);
                       # WGS84 also based on GRS80 but with minor updates
                       GDAL1_integer64_policy = TRUE, stringsAsFactors=FALSE)

# GADM 2012
europe_adm2 <- read_csv(here::here("Witchtrial_diffusion","Leeson-Russ_EJ_2019","data","raw","GADM","europe_gadm_adm2.csv"))

# Calculate the centroid of the counties
centroids_adm2 <- europe_adm2 %>%
  dplyr::select(gadm.adm0 = NAME_0, gadm.adm1 = NAME_1, gadm.adm2 = id, 
         lon = long, lat) %>%
  group_by(gadm.adm0, gadm.adm1, gadm.adm2) %>%
  summarise(c.lon = mean(lon),
            c.lat = mean(lat)) %>% 
  ungroup()

# Recode United Kingdom countries to GADM 1 Regions ---------------------------
trials <- trials %>%
  mutate(gadm.adm0 = if_else(condition = gadm.adm0 %in% "United Kingdom", 
                             true      = gadm.adm1,
                             false     = gadm.adm0))

battles <- battles %>%
  mutate(gadm.adm0 = if_else(condition = gadm.adm0 %in% "United Kingdom",
                             true      = gadm.adm1,
                             false     = gadm.adm0))

centroids_adm2 <- centroids_adm2 %>%
  mutate(gadm.adm0 = if_else(condition = gadm.adm0 %in% "United Kingdom", 
                             true      = gadm.adm1,
                             false     = gadm.adm0))

# update here: Leeson and Russ overwrite NAs in lat/lon with county centroids because several trial data sources have no geo info:
trials %>% filter(is.na(lon)) %>% dplyr::select(record.source) %>% distinct()  
summary(trials$lat) # 5778 rows missing lat
# However, for some observations the data already contains correct lat/lon if matched on city, and using county centroids here produces incorrect locations.
# e.g., Bruges:
trials %>% filter(city=="Bruges") %>% dplyr::select(siteID, gadm.adm2, gadm.adm1, gadm.adm0, lon, lat, record.source)
table(trials$lat[trials$siteID==33], useNA="ifany")
table(trials$lon[trials$siteID==33], useNA="ifany")
# (ALSO, 1 Bruges obs from Kieckhefer (1976) was erroniously coded to GADM Adm 1 Wallonie instead of Vlaanderen - fixed at beginning of main script)
# trials %>% filter(city=="Bruges") %>% dplyr::select(record.source, gadm.adm2, gadm.adm1, gadm.adm0) %>% filter(gadm.adm1=="Wallonie") 
# centroids_adm2 %>% filter(gadm.adm2=="West-Vlaanderen") %>% dplyr::select(gadm.adm1, gadm.adm0, c.lat, c.lon) # They must have manually edited this, because West-Vlaanderen is not a county in Wallonie, and even if it was it shouldn't have an identical centroid to West-Vlaanderen in Vlaanderen. There is no mention of this in L&R Appendix E.
# Additionally, rounding error makes some coords different. The best coords are the ones with 6 total numeric values (excluding decimals). Many of the more-decimals seem to come from observations from data entry from Carlson (2004).
# E.g., Luxeuil-les-Bains
# trials %>% filter(city=="Luxeuil-les-Bains") %>% dplyr::select(siteID, gadm.adm2, gadm.adm1, gadm.adm0, lon, lat, record.source)
# table(trials$lat[trials$siteID==210]) # 47.8168 vs 47.8168409
# table(trials$lon[trials$siteID==210]) # 6.38111 vs 6.381111
# So, prior to merging centroids, 1) for simplicity's sake, round to 4 digits and 2) fill NAs where GADM and city info matches.
trials <- trials %>%
  mutate(lon = as.numeric(formatC(lon, width=7, format="f")), 
         lat = as.numeric(formatC(lat, width=6, format="f"))) %>%
  group_by(gadm.adm0, gadm.adm1, gadm.adm2, city) %>%
  fill(lon, lat, .direction = "updown") %>%
  ungroup()
# One Bruges obs still won't match with rounding, so manually change it
trials$lat[trials$siteID==33] <- 51.2094
trials$lon[trials$siteID==33] <- 3.2247
summary(trials$lat) # now 5762 rows missing lat: matched 16 rows

# Merge centroids onto trials
trials_w_centroids <- centroids_adm2 %>%
  left_join(x = trials, y = ., by = c("gadm.adm2", "gadm.adm1", "gadm.adm0")) %>%
  mutate(lon = if_else(is.na(lon) & !is.na(c.lon), c.lon, lon),
         lat = if_else(is.na(lat) & !is.na(c.lat), c.lat, lat))

summary(trials_w_centroids$lat) # N.B. 1037 of 10885 obs still missing coords
nrow(trials_w_centroids[trials_w_centroids$decade<1550,]) # 1049 pre-1550 obs
nrow(trials_w_centroids[trials_w_centroids$decade<1550&!is.na(trials_w_centroids$c.lon),]) # 814 pre-1550 with coords
nrow(trials_w_centroids[trials_w_centroids$decade>1490&trials_w_centroids$decade<1550&!is.na(trials_w_centroids$c.lon),]) # 279 between 1490 (first MM in 1487) and 1550 that have coords, or about 1/3


# Make events into point shapefiles
trials_coords  <- trials_w_centroids %>% filter(!is.na(lat)) %>% dplyr::select(lon, lat)
battles_coords <- battles %>% filter(!is.na(lat)) %>% dplyr::select(lon, lat)

trials_spdf <- trials_w_centroids %>%
  filter(!is.na(lat)) %>%
  SpatialPointsDataFrame(coords = trials_coords, 
                         data   = .,
                         proj4string = CRS("+init=epsg:4326"))

battles_spdf <- battles %>%
  filter(!is.na(lat)) %>%
  SpatialPointsDataFrame(coords = battles_coords,
                         data   = .,
                         proj4string = CRS("+init=epsg:4326"))

trials_clean <- data.frame(trials_spdf)

write_csv(trials_clean, here::here("Witchtrial_diffusion", "out_data", "trials.csv"))

# Convert to Euro equal area projection
crs_EPSG3035 <- "+proj=laea +lat_0=52 +lon_0=10 +x_0=4321000 +y_0=3210000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs" # units in meters, not degrees
euro_spdf    %<>% spTransform(CRS(crs_EPSG3035))
trials_spdf  %<>% spTransform(CRS(crs_EPSG3035))
battles_spdf %<>% spTransform(CRS(crs_EPSG3035))

trials_laea <- trials_spdf %>% 
  data.frame() %>% # extract all the data to a non-spatial object
  dplyr::select(-lon, -lat, -optional) %>% # drop the WGS84 coordinate columns
  rename(lon=lon.1, lat=lat.1) # rename the EPSG3035 columns as x and y

write_csv(trials_laea, here::here("Witchtrial_diffusion", "out_data", "trials_laea.csv"))

battles_laea <- battles_spdf %>% 
  data.frame() %>% # extract all the data to a non-spatial object
  dplyr::select(-lon, -lat, -optional) %>% # drop the WGS84 coordinate columns
  rename(lon=lon.1, lat=lat.1) %>% # rename the EPSG3035 columns as x and y
  # ID for observation - names have spaces and non-alpha characters
  rowid_to_column(var = "battleID") %>%
  # a non-numeric location ID will be useful later for the distance table
  mutate(newID = paste("Btl", battleID, sep="_"))
  

write_csv(battles_laea, here::here("Witchtrial_diffusion", "out_data", "battles_laea.csv"))
# battles_laea <- read.csv(here::here("Witchtrial_diffusion", "out_data", "battles_laea.csv"), header=TRUE, stringsAsFactors=FALSE)

# Skip section making grids --------------------------------------
# This part is not necessary for our purposes. L&R gridded Europe and then measured trial incidence and battle incidence per grid. Noteworthy is that they measure incidence by decade per grid, and we can mimic this by calculating the number of battles and trials and demonology publications within the past 10 years.
# L&R dealt with having no 0 observations by grouping in 100km^2 grids, balancing their panel, and assigning "0" where quadrants had no trials/battles in a decade. But then they use ln(trials) and recode all 0 to NA (can't take the log of 0!). They do not specify how to handle NAs in their lm regressions, for which the default is "na.omit", which will omit rows that have a missing value for any model variables. 
# Since we want to take spatial dependence into account for demonological printing as well as for battles, we can use counts of battles within 100km and use counts scaled by distance for printing and for other trials.

# # Define SpatialGrid object in the same projection
# bb <- bbox(euro_spdf)
# cs <- c(1, 1) * 250000  # cell size 100km x 100km (for illustration)
# # Units for epsg:3035 = meters
# cc <- bb[ , 1] + (cs / 2)  # cell offset
# cd <- ceiling(diff(t(bb))/cs)  # number of cells per direction
# grd <- GridTopology(cellcentre.offset = cc, cellsize = cs, cells.dim = cd)
# grd
# 
# sp_grd <- SpatialGridDataFrame(grid        = grd,
#                                data        = data.frame(id = 1:prod(cd)),
#                                proj4string = CRS("+init=epsg:3035"))
# 
# poly_grid <- Grid2Polygons(sp_grd)
# 
# # (Skip) Keep only the grids that are over land --------------------------------
# poly_grid_subset <- raster::intersect(x = poly_grid, y = euro_spdf)
# 
# euro_df    <- fortify(euro_spdf)
# grid_df    <- fortify(poly_grid_subset)
# trials_df  <- data.frame(trials_spdf)
# battles_df <- data.frame(battles_spdf)
# 
# trials_pts_poly  <- point.in.poly(trials_spdf, poly_grid_subset)
# battles_pts_poly <- point.in.poly(battles_spdf, poly_grid_subset)
# 
# trials_by_grid <- trials_pts_poly@data %>%
#   as_data_frame() %>%
#   rename(grid.id = z) %>%
#   group_by(grid.id, decade) %>%
#   summarise(trials = sum(tried))
# 
# battles_by_grid <- battles_pts_poly@data %>%
#   as_data_frame() %>%
#   rename(grid.id = z) %>%
#   group_by(grid.id, decade) %>%
#   summarise(battles = n())
#
# combined <- poly_grid_subset@data %>%
#   as_data_frame() %>%
#   rename(grid.id = z) %>%
#   expand(grid.id, decade = seq(1300, 1850, 10)) %>%
#   left_join(y = trials_by_grid,  by = c("grid.id", "decade")) %>%
#   left_join(y = battles_by_grid, by = c("grid.id", "decade")) %>%
#   # Change "missing" battles to zeroes
#   replace_na(list(battles = 0, trials = 0)) %>%
#   # Add three "future" battles columns for the placebo test. For leads or lags 
#   # to work correctly, we need to sort the data frame and use only one grouping 
#   # variable. In this case use country name.
#   arrange(grid.id, decade) %>%
#   group_by(grid.id) %>%
#   mutate(ln.trials   = if_else(trials %in% 0, NA_real_, log(trials)),
#          ln1p.trials = log1p(trials),
#          battles.tp1 = lead(battles, 1),
#          battles.tp2 = lead(battles, 2),
#          battles.tp3 = lead(battles, 3)) %>%
#   ungroup() %>%
#   mutate(grid.id = factor(grid.id) %>% as.numeric)

# # Plot map of Europe as a check
# map <- ggplot() + 
#   geom_polygon(mapping = aes(x = long, y = lat, group = group), 
#                  fill    = "white", 
#                  colour  = "black", data = euro_df) +
#   coord_cartesian(xlim = c(2000000, 8000000), ylim = c(1000000, 5500000)) +
#   geom_point(mapping = aes(x = lon.1, y = lat.1),
#                size   = 2,
#                data   = trials_df) +
#   geom_point(mapping = aes(x = lon.1, y = lat.1),
#              size   = 2, colour = "red",
#              data   = battles_df) +
#   geom_polygon(mapping = aes(x = long, y = lat, group = group), 
#                fill    = NA, 
#                colour  = "black", data = grid_df)
#   
# map
# 

# New section measuring distances --------------------------------------

# NB. 424 battles at 362 locales:
# nrow(battles) #424 records
# battles %>% dplyr::select(city) %>% distinct(city) %>% nrow() #362 cities

battles_matrix <- battles_laea %>%
  group_by(year) %>% 
  summarize(battles_ann = n()) %>% # total battles in a year
  # before lag, make sure all years included
  complete(year=1300:1679) %>% # battles range from 1522 to 1654
  arrange(year) %>%
  replace_na(list(battles_ann = 0)) %>%
  mutate(battles_lag1=lag(battles_ann, n=1, default = NA)) %>% # lag by 1 year
  mutate(battles_sum5 = rollapplyr(battles_ann, width=list(-1:-5), sum, # 5 year moving sum
                              na.rm=TRUE, fill=NA, align="right")) %>%
  mutate(battles_sum10 = rollapplyr(battles_ann, width=list(-1:-10), sum, # 10 year moving sum
                               na.rm=TRUE, fill=NA, align="right")) %>%
  mutate(battles_sum20 = rollapplyr(battles_ann, width=list(-1:-20), sum, # 10 year moving sum
                                    na.rm=TRUE, fill=NA, align="right"))

# don't run: for reference, write battles table to csv
#write.csv(battles_matrix,here::here("Witchtrial_diffusion", "out_data", "battles_matrix.csv"), row.names=TRUE)
# battles_matrix <- read_csv(here::here("Witchtrial_diffusion", "out_data", "battles_matrix.csv"))

# Add list columns of the place names of MM editions and the copy counts in time t and t-1:
battles_matrix <- battles_laea %>%
  dplyr::select(year, newID) %>%
  group_by(year) %>%
  nest(., Battles_0 = c(newID)) %>% # list-column recording battle IDs for years
  full_join(battles_matrix, by="year") %>% # join lists and sums together
  ungroup() %>%
  arrange(year) %>%
  mutate(Battles_1=lag(Battles_0, n=1, default = list(NULL))) # lag by 1 year

# flatten list-column contents from lists of tibbles to lists
Battles_1 <- vector("list", dim(battles_matrix)[1])
for (i in 1:dim(battles_matrix)[1]){
  flat.ids <- battles_matrix$Battles_1[i][[1]]
  flat.ids <- flat.ids[!sapply(flat.ids,is.null)]
  if(length(flat.ids)!=0){
    Battles_1[[i]] <- Reduce(rbind, flat.ids)
  } else {Battles_1[[i]] <- list(NULL)}
}
battles_matrix$Battles_1 <- Battles_1


# # Export grids to csv
# write_csv(combined, "data/clean/panel_dataset_grids.csv")

# can't do csv because of list-columns...
saveRDS(battles_matrix, here::here("Witchtrial_diffusion", "out_data", "battles_matrix.rds"))

